The purpose of this workflow is to compare different numbers of states specified for ChromHMM. We do this in several ways. First, we look for the maximum correlation between chromatin state and gene expression. Second, we look for state overlaps with genomic features.
This workflow can be run after 1.3_Chromatin_States_and_Expression.Rmd has been run for multiple state numbers. It uses the “Prop_Expression_Cor” files to find the maximum and minimum associations between proportion of each state and the expression of each transcript. It plots these maxima and minima across the states analyzed.
library("here")
## here() starts at /Users/atyler/Documents/Projects/Epigenetics/Epigenetics_Manuscript
library("pheatmap")
all.code.dir <- list.files(here("Code"), full.names = TRUE)
for(i in 1:length(all.code.dir)){
all.fun <- list.files(all.code.dir[i], full.names = TRUE, pattern = ".R")
for(j in 1:length(all.fun)){source(all.fun[j])}
}
is.interactive = FALSE
#is.interactive = TRUE
Collect gene expression as well as the proportion of each gene assigned to each state for all runs of ChromHMM
state.num <- list.files(here("Results", "ChromHMM"), full.names = TRUE)
state.names <- basename(state.num)
state.as.num <- as.numeric(sapply(strsplit(state.names, "_"), function(x) x[1]))
state_order <- order(state.as.num)
all.prop <- vector(mode = "list", length = length(state.num))
names(all.prop) <- basename(state.num)[state_order]
for(i in 1:length(state.num)){
prop.file <- list.files(path = state.num[state_order[i]], pattern = "Prop_full_gene_1000.RData", full.names = TRUE)
if(length(prop.file) > 0){all.prop[[i]] <- readRDS(prop.file)}
}
gene.info <- readRDS(here("Data", "RNASeq", "RNASeq_gene_info.RData"))
expr <- readRDS(here("Data", "RNASeq", "Strain_Mean_C_Expression.RData"))
scaled.expr <- readRDS(here("Data", "RNASeq", "Strain_Scaled_C_Expression.RData"))
key.file <- here("Data", "support_files", "strain.color.table.txt")
col.table <- as.matrix(read.table(key.file, sep = "\t", comment.char = "%",
stringsAsFactors = FALSE))
Fit a linear model that explains gene expression within strain with propotion chromain state.
For each run of chromHMM we end up with a fitted model for each state and for each strain. We parse these together to find the best overall number of states.
expr.mat <- t(sapply(expr, function(x) if(length(x) > 1){x}else{rep(NA, 9)}))
rownames(expr.mat) <- names(expr)
#identical(names(expr), names(all.prop[[1]]))
strain.order <- order.strains(names(expr[[1]]), colnames(all.prop[[1]][[1]]), col.table)
#colnames(all.prop[[1]][[1]])[strain.order]
ordered.states <- state.as.num[state_order]
model.fits <- vector(mode = "list", length = length(ordered.states))
names(model.fits) <- ordered.states
for(i in 1:length(ordered.states)){
states <- 1:ordered.states[i]
state.fits <- vector(mode = "list", length = length(states))
names(state.fits) <- states
for(state in states){
all.gene.state <- t(sapply(all.prop[[i]], function(x) if(length(x) > 1){x[state,strain.order]}else{rep(NA, 9)}))
state.fits[[state]] <- sapply(1:ncol(all.gene.state), function(x)
plot.with.model(rankZ(all.gene.state[,x]), rankZ(expr.mat[,x]), confidence = 0.95,
plot.results = FALSE))
}
model.fits[[i]] <- state.fits
}
Plot the fit for each state in each model with the mean 95% confidence intervals across all straints. The fits are very similar across the strains, so we average the fits across strains.
avg_effect <- lapply(model.fits, function(x) sapply(x, rowMeans))
min.fit <- -0.6
max.fit <- 0.6
for(i in 1:length(ordered.states)){
cat("###", ordered.states[i], "state model\n")
state_avg <- avg_effect[[i]]
fit.range <- max.fit - min.fit
state.col <- darken(colors.from.values(1:ordered.states[i], use.pheatmap.colors = TRUE), 1.1)
state.y <- ordered.states[i]:1
if(is.interactive){quartz(width = 5, height = 4.2)}
plot.new()
plot.window(ylim = c(0.5, (ordered.states[i]+0.5)), xlim = c(min.fit, max.fit))
abline(v = seq(min.fit, max.fit, 0.2), col = "gray80")
abline(h = state.y, col = "gray80")
#points(y = state.y, x = state_avg[3,], col = state.col, cex = 1.5, pch = 16) #fit
segments(y0 = state.y, x0 = state_avg[4,], x1 = state_avg[5,],
col = state.col, lwd = 4) #fit bar
segments(y0 = state.y-0.2, x0 = state_avg[4,], y1 = state.y+0.2,
col = state.col, lwd = 4) #lower bound
segments(y0 = state.y-0.2, x0 = state_avg[5,], y1 = state.y+0.2,
col = state.col, lwd = 4) #upper bound
axis(1)
abline(v = 0)
par(xpd = TRUE)
text(y = state.y, x = (min.fit-(fit.range*0.05)), labels = 1:ordered.states[i])
par(xpd = FALSE)
cat("\n\n")
}
## Loading required package: grid
Show all states on a single plot.
all.min <- min(sapply(avg_effect, function(x) min(x[4,])))
all.max <- max(sapply(avg_effect, function(x) max(x[5,])))
all.range <- all.max - all.min
plot.new()
plot.window(xlim = c(0, length(avg_effect)+1), ylim = c(all.min, all.max))
for(i in 1:length(avg_effect)){
effect.col <- colors.from.values(avg_effect[[i]][3,], use.pheatmap.colors = TRUE,
global.color.scale = TRUE, global.min = all.min, global.max = all.max)
x.coord <- jitter(rep(i, ncol(avg_effect[[i]])))
points(x = x.coord, y = avg_effect[[i]][3,], pch = 16,
col = effect.col)
segments(x0 = x.coord, y0 = avg_effect[[i]][5,], y1 = avg_effect[[i]][4,], col = effect.col, lwd = 4)
}
abline(h = 0)
axis(2)
par(xpd = TRUE)
text(x = 1:length(ordered.states), y = (all.min - (all.range*0.05)), labels = ordered.states)
par(xpd = FALSE)
Now we do the same thing for state effects across strains.
scaled.expr.mat <- t(sapply(scaled.expr, function(x) if(length(x) > 1){x[,1]}else{rep(NA, 9)}))
rownames(scaled.expr.mat) <- names(scaled.expr)
#identical(rownames(scaled.expr.mat), names(all.prop[[1]]))
model.across <- vector(mode = "list", length = length(ordered.states))
names(model.across) <- ordered.states
for(i in 1:length(ordered.states)){
states <- 1:ordered.states[i]
state.fits <- matrix(NA, nrow = ordered.states[i], ncol = 5)
rownames(state.fits) <- 1:ordered.states[i]
colnames(state.fits) <- c("r", "p.value", "slope.fit", "slope.lwr", "slope.upr")
for(state in states){
all.gene.state <- t(sapply(all.prop[[i]], function(x) if(length(x) > 1){x[state,strain.order]}else{rep(NA, 9)}))
scaled.gene.state <- t(apply(all.gene.state, 1, scale))
state.fits[state,] <- plot.with.model(rankZ(as.vector(scaled.gene.state)), rankZ(as.vector(scaled.expr.mat)),
confidence = 0.95, plot.results = FALSE)
}
model.across[[i]] <- state.fits
}
Plot these together.
min.fit <- -0.2
max.fit <- 0.3
for(i in 1:length(ordered.states)){
cat("###", ordered.states[i], "state model\n")
state_fit <- model.across[[i]]
fit.range <- max.fit - min.fit
state.col <- darken(colors.from.values(1:ordered.states[i], use.pheatmap.colors = TRUE), 1.1)
state.y <- ordered.states[i]:1
if(is.interactive){quartz(width = 5, height = 4.2)}
plot.new()
plot.window(ylim = c(0.5, (ordered.states[i]+0.5)), xlim = c(min.fit, max.fit))
abline(v = seq(min.fit, max.fit, 0.1), col = "gray80")
abline(h = state.y, col = "gray80")
abline(v = 0)
#points(y = state.y, x = state_fit[,3], col = state.col, cex = 1.5, pch = 16) #fit
segments(y0 = state.y, x0 = state_fit[,4], x1 = state_fit[,5],
col = state.col, lwd = 4) #fit bar
segments(y0 = state.y-0.2, x0 = state_fit[,4], y1 = state.y+0.2,
col = state.col, lwd = 4) #lower bound
segments(y0 = state.y-0.2, x0 = state_fit[,5], y1 = state.y+0.2,
col = state.col, lwd = 4) #upper bound
axis(1)
par(xpd = TRUE)
text(y = state.y, x = (min.fit-(fit.range*0.05)), labels = 1:ordered.states[i])
par(xpd = FALSE)
cat("\n\n")
}
Plot across-strain effects all on one plot.
all.min <- min(sapply(model.across, function(x) min(x[,4])))
all.max <- max(sapply(model.across, function(x) max(x[,5])))
all.range <- all.max - all.min
plot.new()
plot.window(xlim = c(0, length(model.across)+1), ylim = c(all.min, all.max))
for(i in 1:length(model.across)){
effect.col <- colors.from.values(model.across[[i]][,3], use.pheatmap.colors = TRUE,
global.color.scale = TRUE, global.min = all.min, global.max = all.max)
x.coord <- jitter(rep(i, nrow(model.across[[i]])))
points(x = x.coord, y = model.across[[i]][,3], pch = 16,
col = effect.col)
segments(x0 = x.coord, y0 = model.across[[i]][,5], y1 = model.across[[i]][,4], col = effect.col, lwd = 4)
}
abline(h = 0)
axis(2)
par(xpd = TRUE)
text(x = 1:length(ordered.states), y = (all.min - (all.range*0.05)), labels = ordered.states)
par(xpd = FALSE)
Which states across the different models have high and low effects on expression?
emissions.dir <- list.files(here("Data", "ChromHMM"), full.names = TRUE)
emissions.files <- lapply(emissions.dir,
function(x) get.files(x, want = c("emissions", ".txt"), full.names = TRUE))[state_order]
emission.mats <- lapply(emissions.files, function(x) read.table(x[1],
row.names = 1, header = TRUE, sep = "\t"))
all.state.effect <- vector(mode = "list", length = length(ordered.states))
names(all.state.effect) <- ordered.states
ordered.marks <- colnames(emission.mats[[1]])
for(i in 1:length(emission.mats)){
cat("###", ordered.states[i], "\n")
state.effects <- colMeans(sapply(model.fits[[i]], function(x) x[3,]))
effect.order <- order(state.effects, decreasing = FALSE)
mark.order <- match(ordered.marks, colnames(emission.mats[[i]]))
effect.df <- data.frame(state.effects)
ordered.emissions <- emission.mats[[i]][effect.order,mark.order]
rownames(ordered.emissions) <- effect.order
pheatmap(ordered.emissions, annotation_row = effect.df,
cluster_rows = FALSE, cluster_cols = FALSE)
all.state.effect[[i]] <- cbind(emission.mats[[i]][,mark.order], effect.df)
cat("\n\n")
}
all.effect.mat <- Reduce("rbind", all.state.effect)
all.state.names <- unlist(lapply(1:length(all.state.effect),
function(x) paste0(ordered.states[x], "_states_", rownames(all.state.effect[[x]]))))
rownames(all.effect.mat) <- all.state.names
all.emissions <- all.effect.mat[,1:4]
all.effect <- data.frame(all.effect.mat[,5])
colnames(all.effect) <- "state.effect"
rownames(all.effect) <- all.state.names
effect.order <- order(all.effect[,1], decreasing = TRUE)
#pdf("~/Desktop/all.states.pdf", height = 16, width = 4)
#pheatmap(all.emissions[effect.order,], cluster_rows = FALSE)
#pheatmap(all.emissions[effect.order,], annotation_row = all.effect, cluster_rows = FALSE)
layout.mat <- matrix(c(1,1,1,2,2,2,3,0,0), nrow = 3)
layout(layout.mat, widths = c(1, 0.1, 0.1))
par(mar = c(2,6,4,1))
imageWithText(as.matrix(all.emissions[effect.order,]), use.pheatmap.colors = TRUE, show.text = FALSE,
row.text.shift = 0.25, main = "Histone Mark Representation", main.cex = 1.5, main.shift = 0.05,
col.text.shift = -10, col.text.rotation = 0, col.text.adj = 0.5, col.text.cex = 1.5)
test.mat <- cbind(as.matrix(all.effect[effect.order,,drop=FALSE]), as.matrix(all.effect[effect.order,,drop=FALSE]))
par(mar = c(2,0.5,4,2))
imageWithText(test.mat, use.pheatmap.colors = TRUE, show.text = FALSE, row.names = NULL, col.names = NULL,
main = "Effect", main.shift = 0.05, main.cex = 1.5)
par(mar = c(2,2,6,0))
imageWithTextColorbar(test.mat, use.pheatmap.colors = TRUE, cex = 1.5, axis.line = -1.5)
#dev.off()
We next look at each set of states and how the state positions in the B6 animals overlap with B6 genomic coordinates. We used the genomic coordinate files included in ChromHMM, which includes coordinates for exons, TSS, TES, genes, and CpG islands. We downloaded additional files from the UCSC genome browser. To do that we went to [http://genome.ucsc.edu/cgi-bin/hgTables] and downloaded a few bed files by hand for different tracks.
We can use these enrichments to annotate the states we identified in ChromHMM.
enrichment.files <- list.files(here("Results", "Enrichments"),
pattern = ".txt", full.names = TRUE)[state_order]
for(i in 1:length(enrichment.files)){
cat("###", state.names[state_order[i]], "\n")
if(is.interactive){quartz(width = 8, height = 4)}
enrichment.map <- as.matrix(read.delim(enrichment.files[i], row.names = 1))
pheatmap(enrichment.map, scale = "column", cluster_rows = FALSE, cluster_cols = FALSE)
cat("\n\n")
}
pdf(here("Documents", "3.Manuscript", "3.Manuscript", "enrichment.pdf"), width = 5, height = 2.5)
pheatmap(t(enrichment.map[,c(1, 2,3,4,7,8,10,11,13)]), scale = "row", cluster_cols = FALSE)
dev.off()
We have selected the 9-state model for this analysis, so I will limit my discussion to that state here.
State 7, which has the highest positive correlation with expression localizes primarily to promoters, and transcription start sites (TSS), as well as to exons.
State 5, which is also positively correlated with expression localizes to enhancers.
State 3, which is negatively correlated with expression has similar localization to state 7, but is primarily enriched at transcription factor binding sites. It is also enriched at promoters.
State 1 is the absence of all measured marks, and is negatively correlated with expression. It is primarily associated with intergenic regions.
State 2 is also negatively correlated with expression. It is the presence of just H3K27me3. It does not strongly localize to any of the features represented here, but may be slightly more present in introns or up/downstream regulatory regions.
We used the data presented here to select the 9-state model for further analysis.